Frank would like a heatmap of some MS1 data, clustered on
Guy11 samples only and scaled by row in the range 0 -
1.
library(readxl)
library(here)
## here() starts at /Users/jegoussc/GitHub/PD_PRO_1499_06122021_PRM2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(readr)
file <- here("raw/Nrc210121_rgm2r.xlsx" )
cleaned <- read_excel(file, sheet=excel_sheets(file)) |>
select(`Positions in Master Proteins`, `Modifications`, starts_with("Abundances (Grouped):")) |>
pivot_longer(cols = -c(`Positions in Master Proteins`, `Modifications`), names_to = "sample") |>
mutate(
sample = stringr::str_replace(sample, "Abundances \\(Grouped\\): ", "")
) |>
separate(sample, c('time', 'sample'), sep=", ",convert=TRUE) |>
distinct()
write_csv(cleaned, here("cleaned/ms1_cleaned.csv"))
The long dataframe needs pivot-ing into a matrix. Scaling is applied too.
m <- cleaned |>
mutate(sample_time = paste(sample, time, sep="-"),
id = paste(`Positions in Master Proteins`, `Modifications`, sep="-")
) |>
select(id, sample_time, value) |>
pivot_wider(names_from = "sample_time", values_from = "value" )
rows <- m$id
m$id <- NULL
raw_m <- as.matrix(m)
## Now scale
m <- t(apply(raw_m, 1, function(x)(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x,na.rm=TRUE))))
Guy11 columns onlydist calculationThe data are a little sparse, Frank has filtered to ensure there are always at least five measurements out of 12. This is sensible but the distance measure for clustering can’t be computed unless there are at least two overlapping values in the pairs of proteins (row data) being compared (meaning I think that this threshold should be 8). For now I get round this by replacing the NA for pairs without two overlapping values with the maximum distance in the whole distance matrix About 92000 pairs of 36million are affected.
The cluster is plotted briefly.
guy11 <- c('Guy11-0', 'Guy11-1', 'Guy11-1.5', 'Guy11-2', 'Guy11-4', 'Guy11-6')
dpmk <- c('dpmk1-0', 'dpmk1-1', 'dpmk1-1.5', 'dpmk1-2', 'dpmk1-4','dpmk1-6')
#just do the guy11
gm <- m[,guy11]
d <- dist(gm)
d[which(is.na(d))] <- max(d, na.rm = TRUE)
hc <- hclust(d)
row_order <- hc$order
plot(hc)
The heatmap is drawn with this ordering. Note the grey NA values not mentioned in the scale
pal <- viridis::viridis_pal()
colours <- pal(11)
## draw!
ComplexHeatmap::Heatmap(m, column_order = c(guy11, dpmk),
row_order = row_order,
col = colours,
use_raster=FALSE,
heatmap_legend_param = list(title = "Scaled Abundance")
)
We can also replace NA with the minimum observed value on the whole abundance data - before scaling and distance measuring (don’t know whether this is legit this time - Frank can illuminate)
m <- raw_m
m[is.na(m)] <- min(m, na.rm=TRUE)
## Now scale
m <- t(apply(m, 1, function(x)(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x,na.rm=TRUE))))
Now cluster
#just do the guy11
gm <- m[,guy11]
d <- dist(gm)
hc <- hclust(d)
row_order <- hc$order
plot(hc)
The heatmap is drawn with this ordering
pal <- viridis::viridis_pal()
colours <- pal(11)
## draw!
ComplexHeatmap::Heatmap(m, column_order = c(guy11, dpmk),
row_order = row_order,
col = colours,
use_raster=FALSE,
heatmap_legend_param = list(title = "Scaled Abundance")
)
Frank would like to kmeans cluster the matrix. First by
Guy11. We can do this with the data of replacements of the
missing values with the minimal observed values (not the replacements of
the distances with the maximum distance)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_nbclust(gm, kmeans, method="wss", k.max = 50)
I’m going to go with 5 clusters, because the drop off after that is slight. Though its not super clear.
Can’t easily add in the dpmk1 stuff for the
summary..
set.seed(1044)
kmeans_clust <- kmeans(gm, 5, nstart = 25, iter.max=1000)
ComplexHeatmap::Heatmap(kmeans_clust$centers,column_order = guy11, row_order = 1:5,
col = colours,
use_raster=FALSE,
heatmap_legend_param = list(title = "Scaled Abundance")
)
Guy11 kmeans clustered
Guy11 and dpmk1 datamake_hmap <- function(mat, km, col_order, colours) {
r <- list(vector(mode = "list", length = max(km$cluster)))
for (i in 1:max(km$cluster)){
rnames <- which(km$cluster == i)
tm <- mat[rnames,]
if (length(rnames) == 1){
tm <- t(as.matrix(tm))
rownames(tm) <- rnames
}
r[[i]] <- ComplexHeatmap::Heatmap(tm,column_order = col_order, name = paste("Cluster", i),
col = colours,
use_raster=FALSE,
heatmap_legend_param = list(title = "Scaled Abundance") )
}
return(r)
}
hml <- make_hmap(m, kmeans_clust, c(guy11, dpmk), colours)
for (i in 1:max(kmeans_clust$cluster)){
ComplexHeatmap::draw(hml[[i]],padding= grid::unit(c(0,0,0,1.5), "in"))
}
make_csv <- function(mat,km,rows) {
for (i in 1:max(km$cluster)){
rnames <- which(km$cluster == i)
tm <- mat[rnames,]
if (length(rnames) == 1){
tm <- t(as.matrix(tm))
rownames(tm) <- rnames
}
file <- here("analysis", paste("010_kmeans_cluster_", i, "_on_guy11.csv"))
df <- as.data.frame(tm)
df$id = rows[rnames]
write_csv(df, file)
}
}
make_csv(m, kmeans_clust,rows)
Similar to above but allowing all matrix columns to be in the kmeans.
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_nbclust(m, kmeans, method="wss", k.max = 50)
Again, this seems to be about 5 clusters, but it is noisy.
set.seed(1044)
kmeans_clust <- kmeans(m, 5, nstart = 25, iter.max=1000)
ComplexHeatmap::Heatmap(kmeans_clust$centers,column_order = c(guy11, dpmk), row_order = 1:5,
col = colours,
use_raster=FALSE,
heatmap_legend_param = list(title = "Scaled Abundance")
)
Guy11 and dpmk1
kmeans-clustered Guy11 and dpmk1 datahml <- make_hmap(m, kmeans_clust, c(guy11, dpmk), colours)
for (i in 1:max(kmeans_clust$cluster)){
ComplexHeatmap::draw(hml[[i]],padding= grid::unit(c(0,0,0,1.5), "in"))
}
make_csv <- function(mat,km,rows) {
for (i in 1:max(km$cluster)){
rnames <- which(km$cluster == i)
tm <- mat[rnames,]
if (length(rnames) == 1){
tm <- t(as.matrix(tm))
rownames(tm) <- rnames
}
file <- here("analysis", paste("010_kmeans_cluster_", i, "_on_guy11_and_dpmk1.csv"))
df <- as.data.frame(tm)
df$id = rows[rnames]
write_csv(df, file)
}
}
make_csv(m, kmeans_clust,rows)
CJ: Quick update on the heatmap as requested by Neftaly
Colours
pal <- viridis::viridis_pal()
colours <- pal(11)
Dendrogram for the row (proteins) clustering
row_dend = as.dendrogram(hc)
Factor to set the order of the columns
col_split = factor(stringr::str_split(colnames(m), "-", n = 2, simplify = TRUE)[,1], levels = c("Guy11", "dpmk1"))
Labels (times) for the columns
col_labels = c('0h00', '0h00',
'1h00', '1h00', '1h30', '1h30',
'2h00', '2h00', '4h00', '4h00',
'6h00', '6h00')
Factor to set the order of the clusters (according to Neftaly: 2,3,4,5,1):
row_split = factor(kmeans_clust$cluster, levels=c("2","3","4","5","1"))
## draw!
# heatmap-MS1-v1-basic
h1 = ComplexHeatmap::Heatmap(m,
column_order = c(guy11, dpmk),
row_order = row_order,
column_split = col_split,
column_title = c('Guy11', expression(paste(Delta,"pmk1"))),
column_gap = grid::unit(0.25, "cm"),
col = colours,
height=grid::unit(10, "cm"),
width=grid::unit(5, "cm"),
use_raster=FALSE,
heatmap_legend_param = list(title = "Scaled\nAbundance",
legend_direction = "vertical",
legend_height = grid::unit(5, "cm")
),
column_names_gp = grid::gpar(fontsize = 10),
row_names_gp = grid::gpar(fontsize = 5),
column_labels = col_labels
)
# heatmap-MS1-v2-clusters
h2 = ComplexHeatmap::Heatmap(m,
column_order = c(guy11, dpmk),
cluster_rows = row_dend,
row_dend_width = grid::unit(0.1, "cm"),
row_dend_gp = grid::gpar(col = "white"),
row_split = 5,
column_split = col_split,
column_title = c('Guy11', expression(paste(Delta,"pmk1"))),
column_gap = grid::unit(0.25, "cm"),
col = colours,
height=grid::unit(10, "cm"),
width=grid::unit(5, "cm"),
use_raster=FALSE,
heatmap_legend_param = list(title = "Scaled\nAbundance",
legend_direction = "vertical",
legend_height = grid::unit(5, "cm")
),
column_names_gp = grid::gpar(fontsize = 10),
row_names_gp = grid::gpar(fontsize = 5),
column_labels = col_labels
)
h2
# heatmap-MS1-v3-clusters-with-dend
h3 = ComplexHeatmap::Heatmap(m,
column_order = c(guy11, dpmk),
cluster_rows = row_dend,
row_dend_width = grid::unit(4, "cm"),
row_dend_gp = grid::gpar(col = "black"),
row_split = 5,
column_split = col_split,
column_title = c('Guy11', expression(paste(Delta,"pmk1"))),
column_gap = grid::unit(0.25, "cm"),
col = colours,
height=grid::unit(10, "cm"),
width=grid::unit(5, "cm"),
use_raster=FALSE,
heatmap_legend_param = list(title = "Scaled Abundance",
legend_direction = "vertical",
legend_height = grid::unit(5, "cm")
),
column_names_gp = grid::gpar(fontsize = 10),
row_names_gp = grid::gpar(fontsize = 5),
column_labels = col_labels
)
h3
# heatmap-MS1-v4-time-comp
h4 = ComplexHeatmap::Heatmap(m,
column_order = c(guy11, dpmk),
#cluster_rows = row_dend,
row_dend_width = grid::unit(0.1, "cm"),
row_dend_gp = grid::gpar(col = "white"),
row_split = row_split,
cluster_row_slices = FALSE,
#row_order = row_order,
column_split = col_split,
column_title = c('Guy11', expression(paste(Delta,"pmk1"))),
column_title_gp = grid::gpar(fontsize = 12, fontface = "bold"),
#column_gap = grid::unit(0.25, "cm"),
col = colours,
# 25 cm is the maximum to print in portrait orientation on A4
height=grid::unit(10, "cm"), # change to 10 to visualise in RStudio
width=grid::unit(5, "cm"),
use_raster=FALSE,
heatmap_legend_param = list(title = "Scaled\nAbundance",
legend_direction = "vertical",
legend_height = grid::unit(5, "cm")
),
column_names_gp = grid::gpar(fontsize = 8),
row_title_gp = grid::gpar(fontsize = 10),
column_labels = col_labels
)
h4